---
title: "Public Opinion Data Processing"
output: html_notebook
editor_options: 
  chunk_output_type: console
---

This file contains the full script that processes individual polls and generates the final public opinion measure (adjusted for house effects and pooled at the monthly level) used in Ploger et al. "Time-Varying Relationships Between the ‘Sentiment’ of U.S. Public Opinion and News Media." For a plain language description of how these data were processed, see the supplementary materials for the article.

This script generates a slightly different measure from the one used in the manuscript because the manuscript relied partially on proprietary data from Gallup that cannot be shared here. Instead, we provide the raw public opinion data excluding polls shared by Gallup ("plogeretal_ijpor_rawpolls_data_nogallup.csv"). We also provide the original script so that interested readers can follow all the processing decisions made for the manuscript. Note that, because the data are slightly different from those used when this script was written, some annotations regarding the number or characteristics of polls are slightly inaccurate for the public version of the data.

After processing the public opinion data with this script, we merged it with monthly media data (also proprietary) to generate "plogeretal_ijpor_sentiment.Rdata", the data set used for analyses and exhibits in "plogeretal_ijpor_analysis_and_figures.Rmd".

```{r setup, include=F, warning=F}
# Echo chunk source in the rendered notebook by default.
knitr::opts_chunk$set(echo = TRUE)
# lubridate: date parsing (mdy, ymd, as_date) and week/month/year extraction.
library(lubridate)
# dplyr: distinct() and select(), used when removing duplicate polls.
library(dplyr)
```

# Cleaning and preparing raw data
```{r loading data}
# The polling data run through mid 2022. The Ballotpedia archive has a gap
# between pulls from July 2020 to the beginning of 2021.

# Read the public (no-Gallup) poll file and force the N column to numeric
# before the columns are renamed below.
d.full <- read.csv(file = "plogeretal_ijpor_rawpolls_data_nogallup.csv")
d.full$N <- as.numeric(d.full$N)

# Replace the raw header with the 24 working column names, in file order.
names(d.full) <- c(
  "date.range",
  "orig.start",
  "orig.end",
  "mid",
  "days.in.field",
  "pos",
  "neg",
  "mixed",
  "unsure",
  "no.opinion",
  "dont.know",
  "sample.size",
  "moe",
  "survey.mode",
  "data.reported.as",
  "date.structure",
  "mood.operationalization",
  "house",
  "target.population",
  "question",
  "data.source",
  "rep.pos",
  "ind.pos",
  "dem.pos"
)

# Net sentiment: share positive minus share negative.
d.full$net <- with(d.full, pos - neg)
```

```{r cleaning labels for data sources}
# Collapse the labels for the undergraduate RAs' two data collections, and for
# the repeated pulls from Ballotpedia and RCP, into one label per source.
d.full$data.source <- as.factor(d.full$data.source)

levels(d.full$data.source)[
  levels(d.full$data.source) %in% c("First data collection", "Second data collection")
] <- "RA data"

levels(d.full$data.source)[
  levels(d.full$data.source) %in% c("Ballotpedia", "Second Ballotpedia")
] <- "Ballotpedia"

levels(d.full$data.source)[
  levels(d.full$data.source) %in% c("Real clear politics", "Second Real clear politics")
] <- "Real clear politics"
```

```{r cleaning labels for polling houses}
# Keep the house label exactly as reported before any recoding.
d.full$house.messy <- d.full$house

# Recoding works on factor levels, so convert first.
d.full$house <- as.factor(d.full$house)

# Condense the reported house labels into the organisation that actually
# conducted the poll (as accurately as possible). So, AP-GfK becomes GfK.
# Each entry maps a clean label (name) to the raw labels it replaces. Entries
# that map a label to itself change nothing and are listed only to record that
# the house was reviewed. Entries are applied in the order written.
house.map <- list(
  "ABC/WaPo" = c("ABC News/Wash Post", "ABC News/Washington", "Washington Post/ABC", "ABC News/Washington Post Poll"),
  "FTI" = "Allstate/Atlantic Media Heartland Monitor Poll conducted by FTI Consulting",
  "GfK" = c("AP-GfK Poll conducted by GfK Roper Public Affairs & Corporate Communications", "Associated Press/GfK", "Associated Press-GfK"),
  "Ipsos" = c("AP-Ipsos", "Associated Press-Ipsos poll conducted by Ipsos Public Affairs", "Ipsos-McClatchy", "Ipsos/McClatchy", "Reuters/Ipsos", "Reuters/IPSOS"),
  "AP/CNBC" = "Associated Press/CNBC",
  "Ayres McHenry (R)" = "Ayres McHenry (R)",
  "Battleground" = c("Battleground", "GU Politics/Battleground", "GW/Battleground", "GWU/Battleground", "Politico/GWU/Battleground"),
  "Bloomberg" = "Bloomberg",
  "Selzer & Co" = "Bloomberg National Poll conducted by Selzer & Company",
  "CBS" = c("CBS News", "CBS News Poll"),
  "CBS/NYT" = c("CBS News/NY Times", "CBS News/NYT", "CBS/NY Times"),
  "Change Research (D)" = "CNBC/Change Research (D)",
  "CNN" = "CNN",
  "SSRS" = "CNN/SSRS",
  "Gallup" = c("CNN/USA Today/Gallup", "USA Today/Gallup", "Gallup"),
  "RT Strategies" = "Cook/RT Strategies",
  "Democracy Corps (D)" = "Democracy Corps (D)",
  "Financial Dynamics" = c("Diageo/Hotline", "Hotline/FD"),
  "YouGov" = c("Economist/YouGov", "The Economist/YouGov"),
  "Emerson College" = c("Emerson", "Emerson College"),
  "Public Mind" = "FDU/Public Mind",
  "FOX News" = c("Fox News", "FOX News"),
  "Opinion Dynamics" = "FOX News/Opinion Dynamics Poll",
  "Anderson Robbins (D), Shaw & Company (R)" = "Fox News Poll conducted by Anderson Robbins Research (D) and Shaw & Company Research (R) (Conducted by Opinion Dynamics Corp. 1/11 and earlier)",
  "Harris" = c("Harris", "Harvard-Harris"),
  "F&M" = "Hearst/F&M",
  "IBD/TIPP" = "IBD/TIPP",
  "LA Times/Bloomberg" = c("LA Times/Bloomberg", "Los Angeles Times/Bloomberg Poll"),
  "Marist" = c("Marist", "Marist/PBS", "McClatchy/Marist", "NBC News/Marist", "NPR/PBS NewsHour/Marist", "NPR/PBS NewsHours/Marist Poll", "NPR/PBS/Marist", "WNBC/Marist", "PBS/Marist"),
  "Monmouth University" = c("Monmouth", "Monmouth University"),
  "National Journal" = "National Journal",
  "National Journal/FD" = "National Journal/FD",
  "NBC" = "NBC News",
  "NBC/WSJ" = c("NBC News/Wall St. Jrnl", "NBC/WSJ", "Wall Street Journal/NBC"),
  "Hart (D), POS (R)" = "NBC News/Wall Street Journal/Hart Research Associates/Public Opinion Strategies",
  "NYT" = "New York Times",
  "Newsweek" = "Newsweek",
  "Newsweek/Daily Beast" = "Newsweek/Daily Beast",
  "PSRAI" = "Newsweek/Princeton Survey Research Associates International",
  "NPR" = "NPR",
  "POS/GQR" = c("NPR - POS/GQR", "POS/GQR"),
  "GQR" = "NPR/GQR/Resurgent",
  "SCRI" = "NY Times/Siena",
  "Pew" = c("Pew (American Trends Panel)", "Pew Research"),
  "Morning Consult" = "Politico/Morning Consult",
  "POS (R)" = "POS (R)",
  "Quinnipiac University" = c("Quinnipiac", "Quinnipiac University Pol"),
  "Research 2000" = "R2000/Daily Kos (D)**",
  "Rasmussen" = c("Rasmussen Reports", "Scott Rasmussen", "Rasmussen Reports/Pulse Opinion Research"),
  "Reason-Rupe/PSRAI" = "Reason-Rupe/PSRAI",
  "Resurgent Republic (R)" = "Resurgent Republic (R)",
  "Suffolk University" = c("Suffold University/USA Today Poll", "USA Today/Suffolk", "USA Today/Suffolk University"),
  "Atlantic" = "The Atlantic",
  "Time" = c("Time", "Time Poll"),
  "WSJ" = "Wall Street Journal",
  "WaPo" = "Washington Post",
  "Westhill/Hotline" = "Westhill Partners/Hotline",
  "Winning the Issues" = "Winning the Issues",
  "Winston Group (R)" = "Winston Group (R)",
  "Zogby" = "Zogby"
)

for (clean.label in names(house.map)) {
  raw.labels <- house.map[[clean.label]]
  # Levels are re-read on every pass because merging levels renumbers them.
  levels(d.full$house)[levels(d.full$house) %in% raw.labels] <- clean.label
}

# Polls that would be categorised as "Anderson Robbins (D), Shaw & Company (R)"
# but were fielded while Opinion Dynamics still ran the Fox News poll are
# categorised as "Opinion Dynamics" instead.
# The parsed `start` column is only created in a later chunk, so filtering on
# d.full$start here compared against NULL and changed nothing. The start date
# is therefore parsed locally from orig.start using the file's convention:
# values containing a lower-case letter are y/m/d, all others are m/d/y.
# NOTE(review): the cutoff below (start before 2011-01-01) is kept as
# originally coded, but the source label says "1/11 and earlier", i.e.
# January 2011 inclusive. Confirm which is intended.
# NOTE(review): rows relabelled here are no longer in the house list that
# selects "RA data" as preferred source further down. Confirm that is intended.
ar.rows <- which(d.full$house == "Anderson Robbins (D), Shaw & Company (R)")
if (length(ar.rows) > 0) {
  ar.raw.start <- as.character(d.full$orig.start[ar.rows])
  ar.has.letter <- grepl("[a-z]", ar.raw.start)
  ar.start <- rep(as.Date(NA), length(ar.raw.start))
  ar.start[!ar.has.letter] <- mdy(ar.raw.start[!ar.has.letter])
  ar.start[ar.has.letter] <- ymd(ar.raw.start[ar.has.letter])

  early.rows <- ar.rows[!is.na(ar.start) & ar.start < as.Date("2011-01-01")]
  if (length(early.rows) > 0) {
    # Assigning a value that is not an existing factor level would yield NA,
    # so make sure the target level exists first.
    if (!("Opinion Dynamics" %in% levels(d.full$house))) {
      levels(d.full$house) <- c(levels(d.full$house), "Opinion Dynamics")
    }
    d.full$house[early.rows] <- "Opinion Dynamics"
  }
  rm(ar.raw.start, ar.has.letter, ar.start, early.rows)
}
rm(house.map, clean.label, raw.labels, ar.rows)
```

```{r omitting polls from Research 2000}
# Research 2000's polls were (allegedly) fabricated, so every poll attributed
# to that house is dropped.
d.full <- d.full[!(d.full$house == "Research 2000"), ]
```

```{r creating date and time variables}
# Parse the reported start and end dates with lubridate. The raw data were
# standardised so that any value containing a letter is in y/m/d format and
# any value without a letter is in m/d/y format.
#
# This also covers polls that only report month and year, because their start
# and end dates were manually imputed in m/d/y format without a letter.
parse.poll.date <- function(raw) {
  has.letter <- grepl("[a-z]", as.character(raw))
  parsed <- rep(as.Date(NA), length(raw))
  parsed[!has.letter] <- mdy(raw[!has.letter])
  parsed[has.letter] <- ymd(raw[has.letter])
  parsed
}

d.full$start <- parse.poll.date(d.full$orig.start)
d.full$end <- parse.poll.date(d.full$orig.end)
rm(parse.poll.date)

# Middle day of fielding (mid date): the start date plus half of the span
# between start and end. When that span is an odd number of days the half day
# is rounded UP. E.g., the mid date for a survey fielded from 2007-07-11 to
# 2007-07-12 is 2007-07-12, and the mid date for a survey fielded from
# 1979-02-02 to 1979-02-05 is 1979-02-04.
# The half-span is rounded explicitly because adding a fractional number of
# days to a Date keeps the fraction, and such a Date formats (and is therefore
# turned into a factor, week and month) as the earlier day, i.e. rounded down.
half.span <- ceiling(as.numeric(d.full$end - d.full$start, units = "days") / 2)
d.full$mid <- d.full$start + half.span
rm(half.span)

# Day, week and month versions of the mid date, each as a number and a factor.
# day
tmp <- as.numeric(d.full$mid)
d.full$mid.day.num <- tmp / 365 + 1970
d.full$mid.day.fac <- as.factor(d.full$mid)

# week
d.full$mid.week.num <- (week(d.full$mid) - 1) / 52 + year(d.full$mid)
d.full$mid.week.fac <- as.factor(d.full$mid.week.num)

# month
d.full$mid.month.num <- (month(d.full$mid) - 1) / 12 + year(d.full$mid)
d.full$mid.month.fac <- as.factor(d.full$mid.month.num)
```

```{r counting days each poll was in the field}
# A few polls have discontinuous polling periods. Their number of fielded days
# was counted by hand during data collection, so a non-missing raw
# days.in.field value doubles as the flag for a discontinuous period.
d.full$continuous <- ifelse(is.na(d.full$days.in.field), "Yes", "No")

# For continuous polls, days in field is end minus start plus one: a poll that
# started and ended on the same day was fielded for one day, a poll that ended
# the day after it started was fielded for two, and so on.
is.continuous <- d.full$continuous == "Yes"
d.full$days.in.field[is.continuous] <-
  as.numeric(d.full$end[is.continuous] - d.full$start[is.continuous]) + 1
d.full$days.in.field <- as.numeric(d.full$days.in.field)
rm(is.continuous)
```

```{r accounting for incomplete or nonstandard dates}
# Some polls are reported with incomplete date information. For instance,
# Reuters/Ipsos polls don't report individual start or end dates; they conduct
# at least one poll a week and release the results on Sundays. Because those
# are only some of the polls Ipsos conducted, they carry a separate date
# structure ("Week ending on date"). Other polls report only the month in
# which the poll was conducted ("Month and Year").

# Only polls with a full date range are kept.
has.date.range <- d.full$date.structure == "Date range"
d.full <- d.full[has.date.range, ]
rm(has.date.range)
```

```{r manually coding preferred source to omit potential duplicated polls}
# Hand-coded preferred source per house, so that a poll reported by several
# sources is taken from only one of them: Real clear politics or Ballotpedia,
# whichever reports more polls for that house, and the RA data when neither
# reports any. In general, RCP reports the most, so it is the default.
d.full$man.pref.source <- "Real clear politics"

d.full$man.pref.source[d.full$house == "Morning Consult"] <- "Ballotpedia"

d.full$man.pref.source[
  d.full$house %in% c(
    "FTI",
    "Selzer & Co",
    "SSRS",
    "Anderson Robbins (D), Shaw & Company (R)",
    "Hart (D), POS (R)",
    "PSRAI",
    "Winning the Issues",
    "Gallup"
  )
] <- "RA data"

# Keep only the rows that come from their house's preferred source.
d.red.man.prefsource <- d.full[d.full$data.source == d.full$man.pref.source, ]
```

```{r removing exact duplicates}
# Drop exact duplicates on start date, end date, positive share and negative
# share. HOUSE is deliberately left out of the key so that the same poll
# reported under differently formatted house names (e.g., "Hart(D), POS (R)"
# vs. "Hart (D)") still counts as a duplicate. The first row of each
# duplicate set is kept, with all of its columns.
d.red.man.prefsource.distinct <- distinct(
  d.red.man.prefsource,
  start, end, pos, neg,
  .keep_all = TRUE
)
```

```{r populating sample size data}
# How many polls lack a sample size, and what the reported sizes look like.
table(is.na(d.red.man.prefsource.distinct$sample.size))

assume.n <- median(d.red.man.prefsource.distinct$sample.size, na.rm = TRUE)
assume.n
mean(d.red.man.prefsource.distinct$sample.size, na.rm = TRUE)

# In the original (manuscript) data, sample size was missing for 1741 of 3724
# polls; among the 1983 polls reporting one, the median N was 1225 and the
# mean was 1427.

# Median imputation: every missing sample size is set to the median.
d.red.man.prefsource.distinct$sample.size <- replace(
  d.red.man.prefsource.distinct$sample.size,
  is.na(d.red.man.prefsource.distinct$sample.size),
  assume.n
)
```

```{r subsetting to polls starting after 2000 and before august 2021}
# Polling data are very sparse before 2000 and there are no media data after
# July 2021, so only polls that start on or after 2000-01-01 and end before
# 2021-08-01 are kept.
d.2000 <- d.red.man.prefsource.distinct[
  d.red.man.prefsource.distinct$start >= as.Date("2000-01-01"),
]
d.2000 <- d.2000[d.2000$end < as.Date("2021-08-01"), ]

# The intermediate data sets are no longer needed.
rm(d.full, d.red.man.prefsource, d.red.man.prefsource.distinct)
```

# Adjusting polls for house effects
```{r adjusting net mood for house effects (mid date)}
# Model net mood without an intercept, with one coefficient per mid date (date
# as a factor) plus house effects. Under R's default treatment contrasts the
# first house level present in the data is the baseline to which the other
# house effects are compared (ABC/WaPo in the original data).
m.mid.net <- lm(net ~ mid.day.fac + house - 1, data = d.2000)

# All coefficients as a two-column table: estimate and coefficient name.
C <- data.frame(
  coef = unname(coef(m.mid.net)),
  name = names(coef(m.mid.net)),
  stringsAsFactors = FALSE
)

# Keep only the house effects and strip the "house" prefix lm adds. Only the
# leading prefix is removed, so a house whose own name contains "house" would
# not be mangled.
H.mid.net <- C[startsWith(C$name, "house"), ]
H.mid.net$name <- sub("^house", "", H.mid.net$name)

# Some coefficients are not estimated because of low numbers of polls. They are
# set to 0, since they are not associated with any movement in polls beyond
# what date already accounts for.
H.mid.net$coef[is.na(H.mid.net$coef)] <- 0

# Add a row for the baseline house with a coefficient of 0 (the other
# coefficients are relative to it being 0). The baseline is read from the
# fitted model rather than hard-coded, so it stays correct if the first house
# level present in the data is not ABC/WaPo. Assumes default treatment
# contrasts, as the rest of this chunk does.
baseline.house <- m.mid.net$xlevels$house[1]
H.mid.net <- rbind(
  data.frame(coef = 0, name = baseline.house, stringsAsFactors = FALSE),
  H.mid.net
)

# Median house effect, used below to re-centre the adjusted estimates
# (it corresponded to Marist in the original data).
houserecenter.mid.net <- median(H.mid.net$coef)

# Look up each poll's house effect and subtract it: if a house tends to be
# overly pessimistic, we want to reverse that and bring it closer to the
# 'real' value. Finally, add the median house effect back to the adjusted
# estimates.
house.row <- match(as.character(d.2000$house), H.mid.net$name)
d.2000$house.adjust.mid.net <- H.mid.net$coef[house.row]
d.2000$net.adjust.mid <- d.2000$net - d.2000$house.adjust.mid.net + houserecenter.mid.net

rm(baseline.house, house.row)
```

# Pooling data
```{r allocating observations to each day in the field}
# Build the data set for pooling: each poll is repeated once per day it was in
# the field, its sample size is split evenly across those days
# (`observations`), and each copy is assigned one fielding day (`ED`), counting
# back from the end date: end, end - 1, ..., end - (days.in.field - 1).
# This is done in one vectorised step rather than by growing a data frame
# with rbind() inside nested loops.
day.counts <- d.2000$days.in.field
# Every poll must have been in the field for at least one day; a missing,
# zero or negative count cannot be expanded into daily rows.
stopifnot(!anyNA(day.counts), all(day.counts >= 1))

poll.row <- rep(seq_len(nrow(d.2000)), times = day.counts)
day.offset <- sequence(day.counts) - 1

d.2000.pooled <- d.2000[poll.row, ]
d.2000.pooled$observations <- d.2000.pooled$sample.size / d.2000.pooled$days.in.field
d.2000.pooled$ED <- d.2000.pooled$end - day.offset
rownames(d.2000.pooled) <- NULL
rm(day.counts, poll.row, day.offset)

#Manually correcting ED variable for polls with noncontinuous polling periods

# Order the data so it is consistent with the order of the fixed dates below.
# order() leaves ties in their existing order, so the daily rows of each poll
# stay in end-date-first order.
d.2000.pooled <- d.2000.pooled[order(d.2000.pooled$mid),]

#View(d.2000.pooled[d.2000.pooled$continuous=="No",c("date.range","start","end","ED","observations","sample.size","continuous","days.in.field","house")])

# Hand-entered fielding days for the noncontinuous polls, one group per poll,
# latest day first.
noncontinuous.fix <- c("2008-11-28","2008-11-26","2008-11-25",
                       "2008-11-29","2008-11-28","2008-11-26",
                       "2008-12-26","2008-12-22","2008-12-21",
                       "2008-12-27","2008-12-26","2008-12-22",
                       "2009-01-02","2009-01-01","2008-12-29","2008-12-28",
                       "2009-01-03","2009-01-02","2009-01-01",
                       "2009-02-14","2009-02-12","2009-02-11",
                       "2009-02-15","2009-02-14","2009-02-12",
                       "2009-04-13","2009-04-11","2009-04-10",
                       "2009-04-14","2009-04-13","2009-04-11",
                       "2009-05-26","2009-05-24","2009-05-23",
                       "2009-05-27","2009-05-26","2009-05-24",
                       "2009-06-20","2009-06-18","2009-06-17",
                       "2009-06-21","2009-06-20","2009-06-18",
                       "2009-07-05","2009-07-03","2009-07-02",
                       "2009-07-06","2009-07-05","2009-07-03")
noncontinuous.fix <- as.Date(noncontinuous.fix)

# The corrections are positional, so they are applied only when the number of
# noncontinuous daily rows equals the number of hand-entered dates. With no
# such rows (possible in the public data) there is nothing to correct. Any
# other count would recycle dates onto the wrong rows, so it is an error.
noncontinuous.rows <- which(d.2000.pooled$continuous == "No")
if (length(noncontinuous.rows) > 0) {
  if (length(noncontinuous.rows) != length(noncontinuous.fix)) {
    stop(
      "Found ", length(noncontinuous.rows), " daily rows from noncontinuous polls but ",
      length(noncontinuous.fix), " hand-entered dates; update noncontinuous.fix to match the data.",
      call. = FALSE
    )
  }
  d.2000.pooled$ED[noncontinuous.rows] <- noncontinuous.fix
}
rm(noncontinuous.fix, noncontinuous.rows)
```

```{r pooling polls by month}
# Month index of each fielding day (ED): the year plus the zero-based month
# as a twelfth, e.g. March 2010 -> 2010 + 2/12.
d.2000.pooled$ED.month <- year(d.2000.pooled$ED) + (month(d.2000.pooled$ED) - 1) / 12

d.2000.pooled <- d.2000.pooled[order(d.2000.pooled$ED.month), ]
dates <- unique(d.2000.pooled$ED.month)

# For every month, pool the daily rows into an observation-weighted mean of
# (a) unadjusted net mood and (b) net mood adjusted for general house effects
# (mid date). The weighted totals are accumulated left to right from 0, row
# by row, before being divided by the month's total observations.
pooled.polls.month <- do.call(rbind, lapply(dates, function(this.month) {
  mood <- d.2000.pooled[d.2000.pooled$ED.month == this.month, ]
  total.obs <- sum(mood$observations)
  data.frame(
    "net.pooled" = Reduce(`+`, mood$observations * mood$net, 0) / total.obs,
    "net.adjust.mid.pooled" = Reduce(`+`, mood$observations * mood$net.adjust.mid, 0) / total.obs,
    "date" = this.month
  )
}))

# One row per month, in chronological order.
pooled.polls.month <- pooled.polls.month[order(pooled.polls.month$date), ]
```